library(tidyverse)
library(lubridate)
library(imager)
library(stringi)
library(reshape)
# Load the raw labels. skip = 1 discards the first line of the file; the next
# line is read as the header and is replaced with short names just below.
train <- read_csv("Training_data.csv", skip = 1)
train <- setNames(
  train,
  c("file", "time", "weather", "smoke", "pelicans", "pods", "npods",
    "disturbance", "stageofnesting", "preds", "abandon", "pretty")
)

# Strip the trailing apostrophe (the last character) from every file name
train$file <- substr(train$file, start = 1, stop = nchar(train$file) - 1)

# Turn every label column except file (1) and npods (7) into a factor
train[c(2:6, 8:12)] <- lapply(train[c(2:6, 8:12)], factor)

# Collapse the first three weather levels into "cloud"; the remaining two
# become "rain" and "sun" (relies on the column having exactly five levels)
levels(train$weather) <- c("cloud", "cloud", "cloud", "rain", "sun")

# The file name without its 4-character extension is a yyyymmddHHMMSS
# timestamp; parse it into a date-time for the time-series plots
train$date <- ymd_hms(substr(train$file, start = 1, stop = nchar(train$file) - 4))

# Keep columns 1-8, pretty (12) and date (13); this drops stageofnesting,
# preds and abandon
train <- train[, c(1:8, 12, 13)]

# Inspect the cleaned table
glimpse(train)
Observations: 1,011
Variables: 10
$ file <chr> "20170309002000.jpg", "20170309014000.jpg", "20170309091500.jpg", "20170309162000.jpg", "20170309184500...
$ time <fct> day, day, night, day, day, night, day, night, day, day, night, night, day, day, day, night, night, nigh...
$ weather <fct> sun, NA, NA, sun, sun, NA, cloud, NA, sun, cloud, NA, NA, sun, sun, sun, NA, NA, NA, sun, cloud, cloud,...
$ smoke <fct> FALSE, NA, NA, FALSE, FALSE, NA, FALSE, NA, FALSE, FALSE, NA, NA, FALSE, FALSE, FALSE, NA, NA, NA, FALS...
$ pelicans <fct> FALSE, NA, NA, FALSE, FALSE, NA, FALSE, NA, TRUE, TRUE, NA, NA, TRUE, TRUE, FALSE, NA, NA, NA, FALSE, F...
$ pods <fct> FALSE, NA, NA, FALSE, FALSE, NA, FALSE, NA, TRUE, TRUE, NA, NA, TRUE, TRUE, FALSE, NA, NA, NA, FALSE, F...
$ npods <int> NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, NA, NA, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ disturbance <fct> TRUE, NA, NA, FALSE, FALSE, NA, FALSE, NA, FALSE, FALSE, NA, NA, FALSE, FALSE, FALSE, NA, NA, NA, FALSE...
$ pretty <fct> NA, NA, NA, FALSE, FALSE, NA, FALSE, NA, FALSE, FALSE, NA, NA, FALSE, FALSE, FALSE, NA, NA, NA, FALSE, ...
$ date <dttm> 2017-03-09 00:20:00, 2017-03-09 01:40:00, 2017-03-09 09:15:00, 2017-03-09 16:20:00, 2017-03-09 18:45:0...
# Write the cleaned training table to train_clean.csv (path is relative to the
# current working directory).
write_csv(train, "train_clean.csv")
Let’s take a look at when the pelicans were first seen on Lambourne Bay and when they left.
# Presence/absence of pelicans over time, leaving out photos whose `pelicans`
# label is missing.
ggplot(
  data = subset(train, !is.na(pelicans)),
  mapping = aes(x = date, y = pelicans)
) +
  geom_point() +
  labs(
    x = "Date",
    y = "Pelicans",
    title = "Pelican Sightings: 2017"
  )
We can see that the pelicans came to the bay around the start of April and left at around the start of October. Let’s get even more specific:
# Timestamps of every photo in which pelicans were recorded.
# `pelicans` is a factor, so it is compared against the level label "TRUE",
# and NA labels are masked out explicitly. Indexing rows with a logical vector
# that contains NA would create all-NA rows that then have to be stripped
# again with na.omit(). Missing dates are excluded as well, so the result
# holds the same values as before (without the na.action attribute).
pelicans_seen <- !is.na(train$pelicans) & train$pelicans == "TRUE"
sightings <- train$date[pelicans_seen & !is.na(train$date)]

# First ten elements of `sightings`
head(sightings, 10)
[1] "2017-03-11 14:50:00 UTC" "2017-03-11 23:55:00 UTC" "2017-03-12 14:00:00 UTC" "2017-03-12 16:40:00 UTC"
[5] "2017-03-30 23:00:00 UTC" "2017-03-31 02:10:00 UTC" "2017-03-31 13:25:00 UTC" "2017-03-31 15:00:00 UTC"
[9] "2017-04-01 00:10:00 UTC" "2017-04-01 14:05:00 UTC"
We can see that there were some sightings in mid-March, but the bulk of the sightings started late on 3/30 and continued into early 3/31.
# Show the last ten elements of `sightings`. These are the latest sightings
# only if the rows are in chronological order (they appear to be; confirm).
tail(sightings, 10)
[1] "2017-07-25 23:35:00 UTC" "2017-07-26 01:25:00 UTC" "2017-07-26 02:20:00 UTC" "2017-07-26 03:05:00 UTC"
[5] "2017-07-26 12:25:00 UTC" "2017-07-26 16:05:00 UTC" "2017-07-26 18:40:00 UTC" "2017-07-27 13:40:00 UTC"
[9] "2017-07-27 15:35:00 UTC" "2017-08-04 19:00:00 UTC"
We can see that there was a sighting on 8/4 but the bulk of the sightings stopped on 7/27.
These are just the sightings for Lambourne Bay, so it’s conceivable that the birds were still on the island but had moved away from this bay specifically. Ultimately, I believe the pelicans were on Lambourne Bay in 2017 from 3/30 to 7/27.
Now let’s look at the number of pods on the bay over time and see how they changed.
# Pod counts over the season: semi-transparent raw observations with a
# smoothed trend line (no confidence band) drawn on top.
# The y scale is limited to 0-10 and the x axis gets one abbreviated month
# label per month.
ggplot(data = train, mapping = aes(x = date, y = npods)) +
  geom_point(alpha = 0.25) +
  geom_smooth(se = FALSE) +
  scale_x_datetime(date_labels = "%b", date_breaks = "1 month") +
  scale_y_continuous(limits = c(0, 10)) +
  labs(
    x = "Date",
    y = "Number of Pods",
    title = "Number of Pelican Pods: 2017"
  )
We see that the number of pods increased in March and April from 0 pods to about 7 or 8. That number then declined to 1-3 by mid-June and stayed roughly constant until the pelicans left Lambourne Bay.
# Load and display every photo that was flagged as "pretty".
# subset() keeps only rows where the condition is TRUE, so NA flags drop out.
pretty <- subset(train, train$pretty == TRUE)

# Build the image paths in one vectorised call. file.path() joins with "/",
# which yields the same "PeliPhotos1Folder/<file>" strings as before, and it
# returns an empty vector (rather than a bogus "PeliPhotos1Folder/NA" path)
# when no photo is flagged.
pimg <- file.path("PeliPhotos1Folder", pretty$file)

# Load however many flagged photos there are instead of exactly 14.
# The variable keeps its original name `imlist`; note that it shadows the
# imager function of the same name.
imlist <- as.imlist(lapply(pimg, load.image))

# Plot each image on its own; imlist[i] is a one-element image list.
for (i in seq_along(imlist)) {
  plot(imlist[i])
}
# Weather label of each photo plotted against the time it was taken.
ggplot(data = train, mapping = aes(y = weather, x = date)) +
  geom_point()
# Load every photo that was flagged as showing smoke.
# subset() keeps only rows where the condition is TRUE, so NA flags drop out.
smoke <- subset(train, train$smoke == TRUE)

# Build the image paths in one vectorised call. file.path() joins with "/",
# which yields the same "PeliPhotos1Folder/<file>" strings as before, and it
# returns an empty vector (rather than a bogus "PeliPhotos1Folder/NA" path)
# when no photo is flagged.
simg <- file.path("PeliPhotos1Folder", smoke$file)

# Load however many flagged photos there are instead of exactly 2.
# The result stays in `imlist` because the plotting loop that follows reads
# it from there; note that the name shadows the imager function `imlist`.
imlist <- as.imlist(lapply(simg, load.image))
JPEG decompression: Premature end of JPEG file
# Plot every image currently held in `imlist`, one per plot. Iterating with
# seq_along() follows the actual number of loaded images instead of a
# hard-coded count and does nothing when the list is empty.
for (i in seq_along(imlist)) {
  plot(imlist[i])
}
# Timeline of the photos flagged as showing a disturbance.
# Rows whose flag is NA come back from the logical index as all-NA rows;
# na.omit() then removes them (and any other row with a missing value).
df <- train[train$disturbance == TRUE, c("date", "disturbance")]
df <- na.omit(df)
ggplot(data = df, mapping = aes(x = date, y = disturbance)) +
  geom_point(alpha = 0.5)
The abandonment (`abandon`), `stageofnesting` and predator (`preds`) columns are all FALSE or NA and are therefore of little use, so I removed them from the data set before the analysis in Python.